Prepare
# Attach the core tidyverse packages (dplyr, ggplot2, tidyr, purrr, ...).
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
library(tidyverse, warn.conflicts = FALSE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
library(ggplot2, warn.conflicts = FALSE)
library(purrr) # for functional programming
# Read the raw titles table; empty strings and the literal "NA" both become
# missing values.
df <- read.csv("data/netflix_titles.csv", na.strings = c("", "NA"))
Grouping
# Total title count for every (country, type) pair.
# `.groups` is left at its default, so the result stays grouped by country.
by_country_type <- summarise(
  group_by(grouped, country, type),
  cnt = sum(cnt)
)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
Movie Distribution by country
# World map shaded by the number of movies per country; the hover text shows
# the country name and its count.
by_country_type %>%
  filter(type == "Movie") %>%
  plot_ly(
    type = "choropleth",
    locations = ~country,
    locationmode = "country names",
    z = ~cnt,
    color = ~cnt,
    colors = "OrRd",
    colorbar = list(title = "Counts"),
    text = ~ paste(country, "<br>Counts: ", cnt)
  )
TV Show Distribution by country
# World map shaded by the number of TV shows per country; the hover text shows
# the country name and its count.
by_country_type %>%
  filter(type == "TV Show") %>%
  plot_ly(
    type = "choropleth",
    locations = ~country,
    locationmode = "country names",
    z = ~cnt,
    color = ~cnt,
    colors = "OrRd",
    colorbar = list(title = "Counts"),
    text = ~ paste(country, "<br>Counts: ", cnt)
  )
Accumulated TV Shows & Movies
# Titles per country and year, plus a running total (`accum`) per country.
country_accum <- grouped %>%
  group_by(country, year_added) %>%
  summarise(cnt = sum(cnt)) %>%
  # arrange() ignores the grouping, so the whole table is ordered by year;
  # the cumulative sum below therefore runs chronologically per country.
  arrange(year_added) %>%
  group_by(country) %>%
  mutate(accum = cumsum(cnt))
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
# TODO: dynamics graph over year
# World map of the accumulated title count per country as of 2015.
# Use each country's latest row up to 2015 instead of only rows dated 2015:
# a country with no additions in 2015 itself still has a running total and
# must not vanish from the map.
# NOTE(review): assumes year_added compares as a number -- confirm upstream.
country_accum %>%
  filter(year_added <= 2015) %>%
  group_by(country) %>%
  slice_max(year_added, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  plot_ly(
    type = "choropleth",
    locations = ~country,
    locationmode = "country names",
    z = ~accum,
    color = ~accum,
    colors = "OrRd",
    colorbar = list(title = "Accumulated Sum"),
    text = ~ paste(country, "<br>Accumulated Sum: ", accum)
  )
Gathering data
# Share of each title type within a country (percent, one decimal) together
# with the country's total title count, limited to the highest-total rows.
type_prop <- by_country_type %>%
  group_by(country) %>%
  # mutate() keeps one row per (country, type). Returning several rows per
  # group from summarise() is deprecated since dplyr 1.1.0.
  mutate(
    total = sum(cnt),
    prop = round(cnt / total * 100, 1)
  ) %>%
  ungroup() %>%
  select(country, total, prop, type) %>%
  as.data.frame() %>%
  # NOTE(review): top_n() ranks rows, not countries. Each country contributes
  # one row per type, so 20 rows cover fewer than 20 countries -- confirm that
  # this is intended.
  top_n(20, wt = total)
# Country display order for the bar chart, as a character vector.
# Movie rows sort by descending movie share; TV Show rows sort after them
# (their key is the negated share), so countries are ranked by movie share
# and a country is placed by the first row in which it appears.
custom_order <- type_prop %>%
  arrange(desc(ifelse(type == "Movie", prop, -prop))) %>%
  # pull() yields a plain vector directly, avoiding the list built through
  # array() and the deprecated purrr::flatten().
  pull(country) %>%
  unique()
# Stacked horizontal bars: one bar per country, split into the Movie and
# TV Show shares, with each segment labelled by its percentage.
ggplot(
  type_prop,
  aes(y = factor(country, levels = custom_order), x = prop, fill = type)
) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col() +
  geom_text(
    aes(label = scales::percent(prop / 100)),
    position = position_stack(vjust = 0.5),
    color = "white",
    size = 3
  ) +
  labs(
    title = "Proportions of Movie and TV Show by Country",
    y = "Country",
    x = "Proportion (%)",
    fill = "Type"
  ) +
  # Shares are rounded to one decimal, so a stacked bar can end a hair above
  # 100. oob_keep stops the hard limits from censoring (dropping) that segment.
  scale_x_continuous(
    labels = scales::percent_format(scale = 1),
    limits = c(0, 100),
    oob = scales::oob_keep
  ) +
  # Named values tie each colour to its type instead of to level order.
  scale_fill_manual(values = c("Movie" = "#221f1f", "TV Show" = "#b20710")) +
  theme_minimal()
